{ "cells": [ { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sys\n", "from os import path as p" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "polyglot_dir = '/data/polyglot/'\n", "\n", "if polyglot_dir not in sys.path:\n", " sys.path.insert(0, polyglot_dir)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The autoreload extension is already loaded. To reload it, use:\n", " %reload_ext autoreload\n" ] } ], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import polyglot\n", "from polyglot.text import Text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Download packages" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[polyglot_data] Downloading package sentiment2.en to\n", "[polyglot_data] /home/rmyeid/polyglot_data...\n", "[polyglot_data] Package sentiment2.en is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from polyglot.downloader import download, list_packages, _downloader\n", "download(info_or_id=u\"sentiment2.en\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bla = _downloader._collections[\"en\"]\n", "bla.children" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "Using default data directory (/home/rmyeid/polyglot_data)\n", "=========================================\n", " Data server index for \n", "=========================================\n", "Collections:\n", " [P] lang:af............. Afrikaans packages and models\n", " [P] lang:als............ als packages and models\n", " [P] lang:am............. Amharic packages and models\n", " [P] lang:an............. Aragonese packages and models\n", " [P] lang:ar............. Arabic packages and models\n", " [P] lang:arz............ arz packages and models\n", " [P] lang:as............. Assamese packages and models\n", " [P] lang:ast............ Asturian packages and models\n", " [P] lang:az............. Azerbaijani packages and models\n", " [P] lang:ba............. Bashkir packages and models\n", " [P] lang:bar............ bar packages and models\n", " [P] lang:be............. Belarusian packages and models\n", " [P] lang:bg............. Bulgarian packages and models\n", " [P] lang:bn............. Bengali packages and models\n", " [P] lang:bo............. Tibetan packages and models\n", " [P] lang:bpy............ bpy packages and models\n", " [P] lang:br............. Breton packages and models\n", " [P] lang:bs............. Bosnian packages and models\n", " [P] lang:ca............. Catalan packages and models\n", " [P] lang:ce............. Chechen packages and models\n", " [P] lang:ceb............ Cebuano packages and models\n", " [P] lang:cs............. Czech packages and models\n", " [P] lang:cv............. Chuvash packages and models\n", " [P] lang:cy............. Welsh packages and models\n", " [P] lang:da............. Danish packages and models\n", " [P] lang:de............. German packages and models\n", " [P] lang:diq............ diq packages and models\n", " [P] lang:dv............. Divehi packages and models\n", " [P] lang:el............. Greek packages and models\n", " [P] lang:en............. 
English packages and models\n", " [P] lang:eo............. Esperanto packages and models\n", " [P] lang:es............. Spanish packages and models\n", " [P] lang:et............. Estonian packages and models\n", " [P] lang:eu............. Basque packages and models\n", " [P] lang:fa............. Persian packages and models\n", " [P] lang:fi............. Finnish packages and models\n", " [P] lang:fo............. Faroese packages and models\n", " [P] lang:fr............. French packages and models\n", " [P] lang:fy............. Western Frisian packages and models\n", " [P] lang:ga............. Irish packages and models\n", " [P] lang:gan............ gan packages and models\n", " [P] lang:gd............. Scottish Gaelic packages and models\n", " [P] lang:gl............. Galician packages and models\n", " [P] lang:gu............. Gujarati packages and models\n", " [P] lang:gv............. Manx packages and models\n", " [P] lang:he............. Hebrew packages and models\n", " [P] lang:hi............. Hindi packages and models\n", " [P] lang:hif............ hif packages and models\n", " [P] lang:hr............. Croatian packages and models\n", " [P] lang:hsb............ Upper Sorbian packages and models\n", " [P] lang:ht............. Haitian packages and models\n", " [P] lang:hu............. Hungarian packages and models\n", " [P] lang:hy............. Armenian packages and models\n", " [P] lang:ia............. Interlingua packages and models\n", " [P] lang:id............. Indonesian packages and models\n", " [P] lang:ilo............ Iloko packages and models\n", " [P] lang:io............. Ido packages and models\n", " [P] lang:is............. Icelandic packages and models\n", " [P] lang:it............. Italian packages and models\n", " [P] lang:ja............. Japanese packages and models\n", " [P] lang:jv............. Javanese packages and models\n", " [P] lang:ka............. Georgian packages and models\n", " [P] lang:kk............. 
Kazakh packages and models\n", " [P] lang:km............. Khmer packages and models\n", " [P] lang:kn............. Kannada packages and models\n", " [P] lang:ko............. Korean packages and models\n", " [P] lang:ku............. Kurdish packages and models\n", " [P] lang:ky............. Kyrgyz packages and models\n", " [P] lang:la............. Latin packages and models\n", " [P] lang:lb............. Luxembourgish packages and models\n", " [P] lang:li............. Limburgish packages and models\n", " [P] lang:lmo............ lmo packages and models\n", " [P] lang:lt............. Lithuanian packages and models\n", " [P] lang:lv............. Latvian packages and models\n", " [P] lang:mg............. Malagasy packages and models\n", " [P] lang:mk............. Macedonian packages and models\n", " [P] lang:ml............. Malayalam packages and models\n", " [P] lang:mn............. Mongolian packages and models\n", " [P] lang:mr............. Marathi packages and models\n", " [P] lang:ms............. Malay packages and models\n", " [P] lang:mt............. Maltese packages and models\n", " [P] lang:my............. Burmese packages and models\n", " [P] lang:ne............. Nepali packages and models\n", " [P] lang:nl............. Dutch packages and models\n", " [P] lang:nn............. Norwegian Nynorsk packages and models\n", " [P] lang:no............. Norwegian packages and models\n", " [P] lang:oc............. Occitan packages and models\n", " [P] lang:or............. Oriya packages and models\n", " [P] lang:os............. Ossetic packages and models\n", " [P] lang:pa............. Punjabi packages and models\n", " [P] lang:pam............ Pampanga packages and models\n", " [P] lang:pl............. Polish packages and models\n", " [P] lang:pms............ pms packages and models\n", " [P] lang:ps............. Pashto packages and models\n", " [P] lang:pt............. Portuguese packages and models\n", " [P] lang:qu............. 
Quechua packages and models\n", " [P] lang:rm............. Romansh packages and models\n", " [P] lang:ro............. Romanian packages and models\n", " [P] lang:ru............. Russian packages and models\n", " [P] lang:sa............. Sanskrit packages and models\n", " [P] lang:sah............ Sakha packages and models\n", " [P] lang:scn............ Sicilian packages and models\n", " [P] lang:sco............ Scots packages and models\n", " [P] lang:se............. Northern Sami packages and models\n", " [P] lang:sh............. Serbo-Croatian packages and models\n", " [P] lang:si............. Sinhala packages and models\n", " [P] lang:sk............. Slovak packages and models\n", " [P] lang:sl............. Slovenian packages and models\n", " [P] lang:sq............. Albanian packages and models\n", " [P] lang:sr............. Serbian packages and models\n", " [P] lang:su............. Sundanese packages and models\n", " [P] lang:sv............. Swedish packages and models\n", " [P] lang:sw............. Swahili packages and models\n", " [P] lang:szl............ szl packages and models\n", " [P] lang:ta............. Tamil packages and models\n", " [P] lang:te............. Telugu packages and models\n", " [P] lang:tg............. Tajik packages and models\n", " [P] lang:th............. Thai packages and models\n", " [P] lang:tk............. Turkmen packages and models\n", " [P] lang:tl............. Tagalog packages and models\n", " [P] lang:tr............. Turkish packages and models\n", " [P] lang:tt............. Tatar packages and models\n", " [P] lang:ug............. Uyghur packages and models\n", " [P] lang:uk............. Ukrainian packages and models\n", " [P] lang:ur............. Urdu packages and models\n", " [P] lang:uz............. Uzbek packages and models\n", " [P] lang:vec............ vec packages and models\n", " [P] lang:vi............. Vietnamese packages and models\n", " [P] lang:vls............ vls packages and models\n", " [P] lang:vo............. 
Volapük packages and models\n", " [P] lang:wa............. Walloon packages and models\n", " [P] lang:war............ Waray packages and models\n", " [P] lang:yi............. Yiddish packages and models\n", " [P] lang:yo............. Yoruba packages and models\n", " [P] lang:zh............. Chinese packages and models\n", " [ ] lang:zhc............ Chinese Character packages and models\n", " [*] lang:zhw............ zhw packages and models\n", " [ ] task:counts2........ counts2\n", " [P] task:embeddings2.... embeddings2\n", " [P] task:ner2........... ner2\n", " [*] task:sentiment2..... sentiment2\n", " [P] task:tsne2.......... tsne2\n", "\n", "([*] marks installed packages; [P] marks partially installed collections)\n" ] } ], "source": [ "list_packages()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Language Detection" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Text\n", "---------------------------------------- \n", "Bonjour, Mesdames.\n", "\n", "detector:\n", "---------------------------------------- \n", "Language 1: name: French code: fr confidence: 94.0 read bytes: 1204\n", "Language 2: name: un code: un confidence: 0.0 read bytes: 0\n", "Language 3: name: un code: un confidence: 0.0 read bytes: 0\n", "\n", "top language code\n", "---------------------------------------- \n", "fr\n", "\n", "top language name\n", "---------------------------------------- \n", "French\n" ] } ], "source": [ "text = Text(\"Bonjour, Mesdames.\")\n", "print \"Text\\n\", \"-\"*40, \"\\n\", text\n", "detected = text.detected_languages\n", "print\n", "print \"detector:\\n\", \"-\"*40, \"\\n\", detected\n", "print \n", "print \"top language code\\n\", \"-\"*40, \"\\n\", text.language.code\n", "print\n", "print \"top language name\\n\", \"-\"*40, \"\\n\", text.language.name" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenization" ] }, { 
"cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Text\n", "---------------------------------------- \n", "Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex.\n", "\n", "\n", "Words\n", "----------------------------------------\n", "[u'Beautiful', u'is', u'better', u'than', u'ugly', u'.', u'Explicit', u'is', u'better', u'than', u'implicit', u'.', u'Simple', u'is', u'better', u'than', u'complex', u'.']\n", "\n", "Sentences\n", "----------------------------------------\n", "[Sentence(\"Beautiful is better than ugly.\"), Sentence(\"Explicit is better than implicit.\"), Sentence(\"Simple is better than complex.\")]\n" ] } ], "source": [ "zen = Text(\"Beautiful is better than ugly. \"\n", " \"Explicit is better than implicit. \"\n", " \"Simple is better than complex.\")\n", "print \"Text\\n\", \"-\"*40, \"\\n\", zen\n", "detector = zen.language\n", "print\n", "print \"\\nWords\\n\", \"-\"*40\n", "print zen.words\n", "print \"\\nSentences\\n\", \"-\"*40\n", "print zen.sentences" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Polarity" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[polyglot_data] Downloading package sentiment2.en to\n", "[polyglot_data] /home/rmyeid/polyglot_data...\n", "[polyglot_data] Package sentiment2.en is already up-to-date!\n" ] } ], "source": [ "%%bash\n", "polyglot download sentiment2.en" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Word base polarity" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Word Polarity \n", "---------------------------------------- \n", "\n", "Beautiful 0\n", "is 0\n", "better 1\n", "than 0\n", "ugly -1\n", ". 
0\n", "Explicit 0\n", "is 0\n", "better 1\n", "than 0\n", "implicit 0\n", ". 0\n", "Simple 0\n", "is 0\n", "better 1\n", "than 0\n", "complex -1\n", ". 0\n" ] } ], "source": [ "print \"{:<16}{}\".format(\"Word\", \"Polarity\"),\"\\n\", \"-\"*40,\"\\n\"\n", "for w in zen.words:\n", " print \"{:<16}{:>2}\".format(w, w.polarity)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sentence Level Sentiment" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Beautiful is better than ugly. 0.0\n", "Explicit is better than implicit. 1.0\n", "Simple is better than complex. 0.0\n" ] } ], "source": [ "for sent in zen.sentences:\n", " print sent, sent.polarity" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Named Entity Extraction" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "zen.entities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Embeddings" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.05519063, -0.01371501, 0.4883692 , -0.24165028, 0.15249102,\n", " -0.5495227 , 0.27307254, 0.64203113, 0.54172772, 0.05180147,\n", " -0.45538789, -0.30796388, 0.61745948, -0.41822246, -0.28658321,\n", " 0.74634224, 0.47470608, 0.77453768, 1.19995797, 0.47836885,\n", " -0.22754097, 0.1432631 , -0.19801912, 0.24440986, -0.37574792,\n", " -0.14388466, 0.34778944, -0.39550784, -0.01028192, 0.95838851,\n", " 0.35426503, 0.13478422, 0.05386258, 0.36379546, -0.10879917,\n", " -0.71637553, -0.25026572, 0.07875264, 0.57645911, -0.7738995 ,\n", " 0.52438337, 0.33535531, -0.16611245, 0.43598977, 0.8950882 ,\n", " -0.20549561, 0.3005766 , 0.62948579, -0.28185904, -0.15822442,\n", " 
# %%
# Position of the token whose embedding is displayed.
# NOTE(review): per the tokenization output recorded earlier in this notebook,
# index 5 of zen.words is the first "." token, so the vector shown belongs to
# a punctuation mark -- confirm that this is the intended example.
token_index = 5
w = zen.words[token_index]

# Displayed as the cell result; the recorded run shows a float32 array.
w.vector